import dalex as dx
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from keras.models import Sequential
from keras.layers import Dense
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
# Load the raw hotel bookings dataset and preview the first rows to sanity-check the columns.
data = pd.read_csv('hotel_bookings.csv')
data.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | NaN | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | NaN | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
5 rows × 32 columns
# This time all of these variables are used, since a machine with a better
# processor was available.  Keep the target plus a hand-picked mix of numeric
# and categorical predictors.
data = data[['is_canceled', 'lead_time', 'arrival_date_year', 'adults', 'children', 'babies', 'booking_changes',
             'previous_cancellations', 'is_repeated_guest', 'arrival_date_month', 'deposit_type', 'customer_type']]
categorical_features = ['arrival_date_month', 'deposit_type', 'customer_type']
numeric_features = ['lead_time', 'arrival_date_year', 'adults', 'children', 'babies', 'booking_changes',
                    'previous_cancellations', 'is_repeated_guest']
# Drop rows with missing values, then separate features from the target
# (y is kept as a one-column DataFrame, matching the original behavior).
data = data.dropna()
X = data.drop(columns=['is_canceled'])
y = data[['is_canceled']]
# Hold out 10% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# Single synthetic booking used as the reference observation for every
# ceteris-paribus profile below: a July 2015 transient booking for two adults,
# no deposit, 82-day lead time.
observation = pd.DataFrame(
    [{
        'lead_time': 82.0,
        'arrival_date_year': 2015.0,
        'adults': 2.0,
        'children': 0.0,
        'babies': 0.0,
        'booking_changes': 0.0,
        'previous_cancellations': 0.0,
        'is_repeated_guest': 0.0,
        'arrival_date_month': 'July',
        'deposit_type': 'No Deposit',
        'customer_type': 'Transient',
    }],
    index=['observation'],
)
def f1_metric(y_true, y_pred):
    """Batch-level F1 score for use as a Keras training metric.

    Rounds the sigmoid outputs to hard 0/1 labels, derives precision and
    recall from the resulting counts, and combines them into F1.  Every
    division is guarded with K.epsilon() against zero denominators.

    NOTE(review): Keras averages this metric over batches, so the logged
    value only approximates the true epoch-level F1.
    """
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))       # true positives
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))        # all real positives
    predicted_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))     # all predicted positives
    prec = tp / (predicted_pos + K.epsilon())
    rec = tp / (actual_pos + K.epsilon())
    return 2 * prec * rec / (prec + rec + K.epsilon())
# Shared preprocessing for every model below: one-hot encode the categorical
# columns and standardize the numeric ones.
categorical_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()
column_steps = [
    ('categorical', categorical_transformer, categorical_features),
    ('numeric', numeric_transformer, numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)
def keras_classifier_wrapper():
    """Build the feed-forward network plugged into the sklearn pipeline.

    Two hidden sigmoid layers (100 and 50 units) feed a single sigmoid
    output unit that models the binary cancellation probability.

    Bug fix: the original passed ``input_dim=X_train_nn.shape[1]``, but
    ``X_train_nn`` is never defined in this file (the network actually
    receives the *preprocessed* features, whose width depends on the
    one-hot encoding), so building the model raised a NameError.  The input
    dimension is now left unspecified and Keras infers it from the first
    batch at fit time.
    """
    model = Sequential()
    model.add(Dense(100, activation='sigmoid'))
    model.add(Dense(50, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    # f1_metric is tracked alongside accuracy so EarlyStopping can monitor it.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_metric])
    return model
# Wrap the Keras builder so it behaves like an sklearn estimator, and stop
# training early once the (training-set) F1 metric stops improving.
keras_estimator = KerasClassifier(keras_classifier_wrapper, epochs=100, batch_size=200)
early_stopping = EarlyStopping(monitor='f1_metric', mode='max', restore_best_weights=True, patience=10)
nn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('neural_network', keras_estimator),
])
# The 'neural_network__' prefix routes the callback list to KerasClassifier.fit.
nn.fit(X_train, y_train, neural_network__callbacks=[early_stopping])
Epoch 1/100 538/538 [==============================] - 2s 3ms/step - loss: 0.5420 - accuracy: 0.7291 - f1_metric: 0.4596 Epoch 2/100 538/538 [==============================] - 1s 3ms/step - loss: 0.4865 - accuracy: 0.7683 - f1_metric: 0.5776 Epoch 3/100 538/538 [==============================] - 1s 3ms/step - loss: 0.4829 - accuracy: 0.7691 - f1_metric: 0.5777 Epoch 4/100 538/538 [==============================] - 1s 3ms/step - loss: 0.4797 - accuracy: 0.7695 - f1_metric: 0.5779 Epoch 5/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4774 - accuracy: 0.7708 - f1_metric: 0.5814 Epoch 6/100 538/538 [==============================] - 1s 3ms/step - loss: 0.4752 - accuracy: 0.7714 - f1_metric: 0.5844 Epoch 7/100 538/538 [==============================] - 1s 3ms/step - loss: 0.4734 - accuracy: 0.7718 - f1_metric: 0.5873 Epoch 8/100 538/538 [==============================] - 1s 3ms/step - loss: 0.4706 - accuracy: 0.7726 - f1_metric: 0.5912 Epoch 9/100 538/538 [==============================] - 1s 3ms/step - loss: 0.4678 - accuracy: 0.7737 - f1_metric: 0.5971 Epoch 10/100 538/538 [==============================] - 1s 3ms/step - loss: 0.4663 - accuracy: 0.7727 - f1_metric: 0.5971 Epoch 11/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4647 - accuracy: 0.7732 - f1_metric: 0.6020 Epoch 12/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4634 - accuracy: 0.7729 - f1_metric: 0.6019 Epoch 13/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4626 - accuracy: 0.7729 - f1_metric: 0.6029 Epoch 14/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4621 - accuracy: 0.7734 - f1_metric: 0.6043 Epoch 15/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4610 - accuracy: 0.7731 - f1_metric: 0.6034 Epoch 16/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4603 - accuracy: 0.7731 - f1_metric: 0.6029 Epoch 17/100 538/538 [==============================] - 
2s 3ms/step - loss: 0.4593 - accuracy: 0.7736 - f1_metric: 0.6028 Epoch 18/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4584 - accuracy: 0.7738 - f1_metric: 0.6047 Epoch 19/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4582 - accuracy: 0.7743 - f1_metric: 0.6040 Epoch 20/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4568 - accuracy: 0.7750 - f1_metric: 0.6048 Epoch 21/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4563 - accuracy: 0.7744 - f1_metric: 0.6035 Epoch 22/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4557 - accuracy: 0.7747 - f1_metric: 0.6039 Epoch 23/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4551 - accuracy: 0.7748 - f1_metric: 0.6031 Epoch 24/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4539 - accuracy: 0.7753 - f1_metric: 0.6043 Epoch 25/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4536 - accuracy: 0.7753 - f1_metric: 0.6035 Epoch 26/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4529 - accuracy: 0.7751 - f1_metric: 0.6031 Epoch 27/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4524 - accuracy: 0.7756 - f1_metric: 0.6033 Epoch 28/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4518 - accuracy: 0.7753 - f1_metric: 0.6021 Epoch 29/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4512 - accuracy: 0.7764 - f1_metric: 0.6046 Epoch 30/100 538/538 [==============================] - 2s 3ms/step - loss: 0.4506 - accuracy: 0.7761 - f1_metric: 0.6040
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('categorical',
OneHotEncoder(),
['arrival_date_month',
'deposit_type',
'customer_type']),
('numeric', StandardScaler(),
['lead_time',
'arrival_date_year',
'adults', 'children',
'babies', 'booking_changes',
'previous_cancellations',
'is_repeated_guest'])])),
('neural_network',
<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001E9A13FE3A0>)])
# Column 0 of predict_proba is P(no cancellation): predict class 0 whenever it
# exceeds 0.5, otherwise class 1 (equivalent to the original transpose trick).
prob_not_canceled = nn.predict_proba(X_test)[:, 0]
y_pred = np.where(prob_not_canceled > 0.5, 0, 1)
print(f'f1-score: {f1_score(y_test, y_pred)}')
# parameters were not tuned, so results are not the best
f1-score: 0.6001794258373205
nn.predict_proba(observation) # class probabilities for the reference booking: class 0 (no cancellation) is more likely
array([[0.72625995, 0.27374008]], dtype=float32)
# Wrap the fitted pipeline in a dalex explainer for model-agnostic profiling.
exp_nn = dx.Explainer(nn, X_train, y_train, label='neural_network')
Preparation of a new explainer is initiated -> data : 107447 rows 11 cols -> target variable : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray. -> target variable : 107447 values -> model_class : tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier (default) -> label : neural_network -> predict function : <function yhat_proba_default at 0x000001E98FA0F820> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.00479, mean = 0.358, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.984, mean = 0.012, max = 0.992 -> model_info : package sklearn A new explainer has been created!
nn_profile = exp_nn.predict_profile(new_observation = observation)
# plot the Ceteris Paribus profile for the reference booking
nn_profile.plot(variables = ['lead_time', 'children', 'babies', 'booking_changes'])
# in this model the children and babies variables did not matter at all
# a bigger lead time (around 95 days) increased the probability of cancellation, as did lead time > 400 days;
# when lead_time values fall between those values, or are smaller than 95 days, the probability of not coming drops
# a number of children greater than 1 makes cancellation more likely (and 1 child makes it less likely)
# the number of babies and booking changes behave the same and show a lower probability of cancellation around value 1;
# then the probability grows a bit and then falls down
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 11/11 [00:01<00:00, 7.66it/s]
# Random-forest baseline: shallow trees (depth 5) on the same preprocessed features.
forest_steps = [
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestClassifier(max_depth=5, random_state=0)),
]
forest = Pipeline(steps=forest_steps)
forest.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('categorical',
OneHotEncoder(),
['arrival_date_month',
'deposit_type',
'customer_type']),
('numeric', StandardScaler(),
['lead_time',
'arrival_date_year',
'adults', 'children',
'babies', 'booking_changes',
'previous_cancellations',
'is_repeated_guest'])])),
('random_forest',
RandomForestClassifier(max_depth=5, random_state=0))])
# Held-out F1 for the random forest.
forest_f1 = f1_score(y_test, forest.predict(X_test))
print(f'f1-score: {forest_f1}')
f1-score: 0.5523871598500896
forest.predict(observation) # predicted class 0 (no cancellation) for the reference booking — same as the neural network
array([0], dtype=int64)
# dalex explainer for the random-forest pipeline, for comparison with the neural network.
exp_forest = dx.Explainer(forest, X_train, y_train, label='random_forest')
Preparation of a new explainer is initiated -> data : 107447 rows 11 cols -> target variable : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray. -> target variable : 107447 values -> model_class : sklearn.ensemble._forest.RandomForestClassifier (default) -> label : random_forest -> predict function : <function yhat_proba_default at 0x000001E98FA0F820> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.0687, mean = 0.37, max = 0.976 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.749, mean = -0.000121, max = 0.913 -> model_info : package sklearn A new explainer has been created!
forest_profile = exp_forest.predict_profile(new_observation = observation)
forest_profile.plot(variables = ['lead_time', 'children', 'babies', 'booking_changes'])
# this model was not so dependent on lead time, nor on the number of children and babies
# a number of children > 1 makes cancellation more likely (the neural network suggested the opposite)
# a nonzero number of booking changes decreased the cancellation probability
# the number of babies did not have a bigger impact on the prediction
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 11/11 [00:00<00:00, 19.92it/s]
# AdaBoost baseline with 100 weak learners on the same preprocessed features.
adaboost_steps = [
    ('preprocessor', preprocessor),
    ('adaboost', AdaBoostClassifier(n_estimators=100, random_state=0)),
]
adaboost = Pipeline(steps=adaboost_steps)
adaboost.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('categorical',
OneHotEncoder(),
['arrival_date_month',
'deposit_type',
'customer_type']),
('numeric', StandardScaler(),
['lead_time',
'arrival_date_year',
'adults', 'children',
'babies', 'booking_changes',
'previous_cancellations',
'is_repeated_guest'])])),
('adaboost',
AdaBoostClassifier(n_estimators=100, random_state=0))])
# Held-out F1 for AdaBoost.
adaboost_f1 = f1_score(y_test, adaboost.predict(X_test))
print(f'f1-score: {adaboost_f1}')
f1-score: 0.5842662168379082
adaboost.predict(observation) # predicted class 0 (no cancellation) for the reference booking
array([0], dtype=int64)
# dalex explainer for the AdaBoost pipeline.
exp_adaboost = dx.Explainer(adaboost, X_train, y_train, label='adaboost')
Preparation of a new explainer is initiated -> data : 107447 rows 11 cols -> target variable : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray. -> target variable : 107447 values -> model_class : sklearn.ensemble._weight_boosting.AdaBoostClassifier (default) -> label : adaboost -> predict function : <function yhat_proba_default at 0x000001E98FA0F820> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.484, mean = 0.5, max = 0.587 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.513, mean = -0.129, max = 0.514 -> model_info : package sklearn A new explainer has been created!
adaboost_profile = exp_adaboost.predict_profile(new_observation = observation)
adaboost_profile.plot(variables = ['lead_time', 'children', 'babies', 'booking_changes'])
# here only a number of children > 6 mattered (making cancellation more likely)
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 11/11 [00:00<00:00, 16.05it/s]
In general, the models demonstrated similar behavior (a bigger lead time increases, and a larger number of booking changes decreases, the cancellation probability) and had similar significant variables. The neural network, however, demonstrated somewhat more complex decision changes (the probability went up or down several times depending on the variable values). Of course, the results would be more reliable if the models were better trained (but hyperparameter tuning is out of scope for this homework).